#
#
# Merge 
#
#   1) JHUcomb.csv (ECG)
#   2) icd.data.oct.11.2007.csv (SNP) -- cleaned version of sept so that the 
#                                        JHUIDs are in the common format
#                                        got rid of "new" or "A" or "B"
#   3) age.gender.csv (AGE.GENDER)
#   4) ReynRaceData-100207.csv (RACE)
#   5) firing.data.4.09.2008.csv (IND)
#   6) img.data11.19.07.csv (IMAGE)
#
# Read data from file "JHUcomb.csv" in the current folder 
# which is specified under R menu->File->Change directory. 
#
# The file is in csv format. We use the function 
# read.csv("file path"). 
#
# The "as.is=T" means not converting character variables to
# factors. read.csv() by default converts the character 
# variables (which are not converted to logical, numeric or
# complex) to factors.
#
ECG<-read.csv("JHUcomb.csv", as.is=T)
#
# Read data from "icd.data.oct.11.2007.csv" in the current
# folder. And do not convert character variables to factors.
#
SNP<-read.csv("icd.data.oct.11.2007.csv",as.is=T)
#
# Read data from "age.gender.csv" in the current
# folder. And do not convert character variables to factors.
#
AGE.GENDER<-read.csv("age.gender.csv",as.is=T)
#
# Read data from "ReynRaceData-100207.csv" in the current
# folder. And do not convert character variables to factors.
#
RACE<-read.csv("ReynRaceData-100207.csv",as.is=T)
#
# Read data from "firing.data.4.09.2008.csv" in the current
# folder. And do not convert character variables to factors.
#
IND<-read.csv( "firing.data.4.09.2008.csv",as.is=T)
#
# Read data from "img.data11.19.07.csv" in the current
# folder. And do not convert character variables to factors.
#
IMAGE<-read.csv("img.data11.19.07.csv",as.is=T)
#
# Need to create an ID variable for the ECG data
# based on the first column which is a filename.
#
# Assign N.ECG the value of number of rows in ECG dataframe.
#
N.ECG<-dim(ECG)[1]
#
# Add a column called "ID" in ECG dataframe. Assign all ""s
# to the column.
#
# rep("",N.ECG) means generating a vector by repeating "" 
# N.ECG times.
#
ECG$ID<-rep("",N.ECG)
#
# Do a loop to every row in N.ECG dataframe. In each row, if
# the first column is a "NA", then assign the ID column "NA".
# Otherwise, if ECG is string and its first charactor is a "."
# then assign ID column the value of "JHU"+the substring of
# the first column (from 7th character to the 9th character)
# and without any separation mark.
# If it is not a ".", then do the same assignment except using
# the string from 4th character to the 6th character.
#
for (i in seq(1,N.ECG))
{ 
  #
  # is.na() detect whether variable is NA or not. It returns 
  # either TRUE or FALSE.
  #
  # ECG[i,1] means the element in the ith row and 1st column.
  #
  if (is.na(ECG[i,1]))
  {
     # assign the element in ith row and ID column the value of NA
     #
     ECG$ID[i]<-NA
  }
  else
  {
     #
     # Determine whether the 1st character of the string in ith row
     # and 1st column is a "."
     #
     # == is an operator to detect whether both sides are equal
     # It returns a logical value of TRUE or FALSE.
     #
     if (substring(ECG[i,1],1,1)==".")
     {
        # assign the element in ith row and ID column a string, which
        # comprise "JHU" and from 7th to 9th characters in the string
        # in the element in ith row and 1st column.
        #
        ECG$ID[i]<-paste("JHU",substring(ECG[i,1],7,9),sep="")
     }
     else
     {  
        # assign the element in ith row and ID column a string, which
        # comprise "JHU" and from 4th to 6th characters in the string
        # in the element in ith row and 1st column.
        #
        ECG$ID[i]<-paste("JHU",substring(ECG[i,1],4,6),sep="")
     }
  }
}
#
# Recode the -999's in ECG as NA's
#
# Look through each row and each column to see if there is a -999 and
# replace it with NA.
#
# Look through each row in ECG.
#
for (i in seq(1,dim(ECG)[1]))
{
   # Look through each column.
   for (j in seq(1,dim(ECG)[2]))
   {
      # Determine whether the current cell is not a NA.
      #
      # ! is an operator for "not". For example. "!(1==2)" is TRUE
      if (!is.na(ECG[i,j]))
      {
         # Determine whether the current cell is -999, if TRUE, then 
         # assign the current cell a NA.
         if (ECG[i,j]==-999)
         {
            ECG[i,j]<-NA
         }
      }
   }  
}
#
# Make all SNP calls that equal ERROR, UNDETERMINED or - into NA's
#
# Look through each row and in column 2 to 7, find all the cells that are 
# ERROR, UNDETERMINED or -, replace them with NA's.
#
# create indicator for all SNP's having been called
#
# Look through each row
#
for (i in seq(1,dim(SNP)[1]))
{
   # Look through each element from 2nd column to 7th column.
   for (j in seq(2,7))
   {
     # Deter whether the current cell is ERROR, UNDETERMINED or -, if TRUE, 
     # then rewrite it as a NA.
     if ((SNP[i,j]=="UNDETERMINED")||(SNP[i,j]=="-")||(SNP[i,j]=="ERROR")) 
     {
        SNP[i,j]<-NA
     }
   }  
}
# Rename the SNP columns as "ID", "snp1", "snp2", "snp3", "snp4", "snp5"
# and "snp6".
#
names(SNP)<-c("ID","snp1","snp2","snp3","snp4","snp5","snp6")
#
# Recode blank gender as NA
# 
# Look through each row in column Gender in AGE.GENDER dataframe. Replace
# all the blank cells with NA's.
#
for (i in seq(1,dim(AGE.GENDER)[1]))
{
   # Determine whether the current cell is blank. If so, then assign it a NA.
   if (AGE.GENDER$Gender[i]=="")
   {
      AGE.GENDER$Gender[i]<-NA
   }
}
#
# Recode RACE as NA if it is not A, B, W or O
#
# Look through the column Race in data frame RACE.
#
for (i in seq(1,dim(RACE)[1]))
{
   # Determine if the current cell is A, B, W or O, if not, assign it a NA.
   if ((RACE$Race[i]!="A")&&
       (RACE$Race[i]!="B")&&
       (RACE$Race[i]!="W")&&
       (RACE$Race[i]!="O"))
         {
           RACE$Race[i]<-NA
         }
}
#
# Rename the 1st column (PID variable) of data frame SNP, AGE.GENDER,
# RACE as ID. 
#
names(AGE.GENDER)[1]<-"ID"
names(RACE)[1]<-"ID"
names(SNP)[1]<-"ID"
#
# Rename the IND Study.ID variable as ID 
#
names(IND)[1]<-"ID"
#
#
# Rename the IMAGE ReynoldsNum variable as ID 
#
names(IMAGE)[1]<-"ID"
#
# Clean the inducibility data so that
# (a) the ID's don't have trailing -I
# (b) the phenotype is either yes, no or NA 
#
# Note that IND$Inducible is the variable telling us if we have
# inducible data (1) or not (0)
#
# Assign L the number of rows in data frame IND
#
L<-dim(IND)[1]
#
# 
#
for (i in seq(1,L))
{
   #
   # fix the ID by extracting the first 6 characters
   #
   IND$ID[i]<-substr(IND$ID[i],1,6)
   # Determine if the Inducible variable is a NA
   if (!is.na(IND$Inducible[i]))
   {
      # Determine if the Inducible variable is "no" or "yes"
      if ((IND$Inducible[i]!="no")&&(IND$Inducible[i]!="yes"))
      {  
         # If the Inducible variable is "no, ", then we change it to 
         # "no".
         if (IND$Inducible[i]=="no, ")
         {
            IND$Inducible[i]<-"no"
         }
         # For all other cases, we assign it a NA.
         else
         {
            IND$Inducible[i]<-NA 
         }
      }
   }
}
#
# Create indicators that just tell us if an ID is in a dataset
#
# Add a column in SNP named IDIN.SNP.IND with all 1s.
#
SNP$IDIN.SNP.IND<-rep(1,dim(SNP)[1])
#
# Add a column in ECG named IDIN.ECG.IND with all 1s.
#
ECG$IDIN.ECG.IND<-rep(1,dim(ECG)[1])
#
# Add a column in AGE.GENDER named IDIN.AGE.GENDER.IND with all 1s.
#
AGE.GENDER$IDIN.AGE.GENDER.IND<-rep(1,dim(AGE.GENDER)[1])
#
# Add a column in RACE named IDIN.RACE.IND with all 1s.
#
RACE$IDIN.RACE.IND<-rep(1,dim(RACE)[1])
#
# Add a column in IND named IDIN.IND.IND with all 1s.
#
IND$IDIN.IND.IND<-rep(1,dim(IND)[1])
#
# Add a column in IMAGE named IDIN.IMAGE.IND with all 1s.
#
IMAGE$IDIN.IMAGE.IND<-rep(1,dim(IMAGE)[1])
#
# Add a column in IMAGE named IMAGE.IND with all 1s.
#
IMAGE$IMAGE.IND<-rep(1,dim(IMAGE)[1])
#
# Merge ECG and SNP data frames together by the common column ID, and name
# it d1.
# Extra rows will be added to the output for each row in x that has no
# matching row in y. These rows will have NAs in those columns that are
# usually filled with values from y.
#
d1<-merge(ECG,SNP,by.x="ID",by.y="ID",all=TRUE)
#
# Merge d1 and AGE.GENDER data frames together by the common column ID,
# and name it d2.
#
d2<-merge(d1,AGE.GENDER,by.x="ID",by.y="ID",all=TRUE)
#
# Merge d2 and RACE data frames together by the common column ID, and
# name it d3.
#
d3<-merge(d2,RACE,by.x="ID",by.y="ID",all=TRUE)
#
# Merge d3 and IND data frames together by the common column ID, and
# name it d4.
#
d4<-merge(d3,IND,by.x="ID",by.y="ID",all=TRUE)
#
# Merge d4 and IMAGE data frames together by the common column ID, and
# name it d5.
#
d5<-merge(d4,IMAGE,by.x="ID",by.y="ID",all=TRUE)
#
# Rename d5 as d.
#
d<-d5
# remove the variable d1, d2, d3, d4, d5.
rm(d1)
rm(d2)
rm(d3)
rm(d4)
rm(d5)
#
# Create indicators of data available
#
# If all the cells in the same row in column snp1, snp2, snp3, snp4, snp5
# snp6 are not missing value (NA), then assign the indicator TRUE. Otherwise,
# assign indicator FALSE. Name the indicator SNP.ALL.IND.
#
d$SNP.ALL.IND<-complete.cases(d$snp1,d$snp2,d$snp3,d$snp4,d$snp5,d$snp6)
#
# If the cell in column QTVI_log is not missing value (NA), then assign
# the indicator TRUE. Otherwise, assign indicator FALSE. Name the 
# indicator ECG.IND.
#
d$ECG.IND<-complete.cases(d$QTVI_log)
#
# If the cell in column Birth.Year.x is not missing value (NA), then assign
# the indicator TRUE. Otherwise, assign indicator FALSE. Name the indicator
# AGE.IND.
#
d$AGE.IND<-complete.cases(d$Birth.Year.x)
#
# If the cell in column Gender is not missing value (NA), then assign
# the indicator TRUE. Otherwise, assign indicator FALSE. Name the indicator
# GENDER.IND.
#
d$GENDER.IND<-complete.cases(d$Gender.x)
#
# If the cell in column Gender is not missing value (NA), then assign
# the indicator TRUE. Otherwise, assign indicator FALSE. Name the indicator
# RACE.IND.
#
d$RACE.IND<-complete.cases(d$Race)
#
# If the cell in column Inducible is not missing value (NA), then assign
# the indicator TRUE. Otherwise, assign indicator FALSE. Name the indicator
# IND.IND.
#
d$IND.IND<-complete.cases(d$Inducible)
#
# If the cell in column DEmass is not missing value (NA), then assign
# the indicator TRUE. Otherwise, assign indicator FALSE. Name the indicator
# IMAGE.IND.
#
d$IMAGE.IND<-complete.cases(d$DEmass)
#
# Filter out the non-adults
#
# 
#
# Set missing birth years to zero
#
d$Birth.Year.x[is.na(d$Birth.Year.x)]<-0
d$Birth.Year.y[is.na(d$Birth.Year.y)]<-0
#
# Create a new Birth.Year variable: 
#    if Birth.Year.x is missing, take Birth.Year.y, otherwise take 
#    Birth.Year.x
#
d$Birth.Year<-d$Birth.Year.x+d$Birth.Year.y*(d$Birth.Year.x==0)
#
# Keep only those born before 1995.
#
# d[d$Birth.Year<=1995,] means all rows in d that have Birth.Year less or 
# equal to 1995.
#
d<-d[d$Birth.Year<=1995,]
#
# Convert firing & implant dates to date format and create an indicator for
# implantation.
#
# If Implant.Date is not a NA, assign TRUE to IMPLANT.IND. Otherwise, assign
# FALSE. 
# 
d$IMPLANT.IND<-complete.cases(d$Implant.Date)
#
# Convert Firings to another date formate (eg. "09/21/2008") and name it
# Firing.Date.
#
d$Firing.Date<-as.Date(d$Firings,format="%m/%d/%Y")
#
# Convert Implant.Date to another date formate (eg. "09/21/2008") and still
# name it Implant.Date.
#
d$Implant.Date<-as.Date(d$Implant.Date,format="%m/%d/%Y")
#
# Calculate the number of TRUEs in column IMPLANT.IND
#
sum(d$IMPLANT.IND)
#
# Calculate the number of NAs in column IMPLANT.IND
#
sum(!is.na(d$Implant.Date))
#
# Calculate the days between Firing.Date and Implant.Date, assign it to
# Days.To.Firing
#
d$Days.To.Firing<-d$Firing.Date-d$Implant.Date
#
# Create a indicator FIRED.IND to show the NAs in column Days.To.Firing.
#
d$FIRED.IND<-!is.na(d$Days.To.Firing)
#
# Compute days to today, assuming today is March 4, 2008
#
today<-as.Date("3/04/2008",format="%m/%d/%Y")
d$Days.Of.Implant<-today-d$Implant.Date
#
# Create a indicator for AP.vs.IAP. If AP.vs.IAP is "AP", then assign the
# indicator TRUE, otherwise, assign FALSE.
#
d$APP.FIRED.IND<-(d$AP.vs.IAP=="AP")
#
#
# Write data frame d to a csv file
#
write.csv(d,file="data.csv",row.names=F)
#
#
# Make data frame of those for which we have inducibility data
#
dind<-d[d$IND.IND,]
#
#
# Write data frame "dind" to a csv file
#
write.csv(dind,file="data.ind.csv",row.names=F)
#
# SANCAR - Don't need to look past here but I used the following 
# stuff to get the Venn diagram reconciled with Bruno a while back
#
#
# Counts of ID's appearing in a list of datasets.
#
# Make clinical indicators
#
d$CLINICAL.IND<-(d$AGE.IND)*(d$RACE.IND)*(d$GENDER.IND)
dind$CLINICAL.IND<-(dind$AGE.IND)*(dind$RACE.IND)*(dind$GENDER.IND)
#
#
#





